4  Explore Sub-Daily Data

Purpose: Explore data at sub-daily temporal resolutions (e.g., 15-min and 1-hour time steps).

4.1 Data

4.1.1 Load data

Bring in site info and sub-daily data

Code
# Site information: read the master site table, convert it to an sf point
# layer (coords assumed lon/lat in WGS84, EPSG:4326), and render an
# interactive map colored by site designation.
siteinfo <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_SiteInformation.csv")
siteinfo_sp <- siteinfo %>% 
  st_as_sf(coords = c("long", "lat"), crs = 4326)
mapview(siteinfo_sp, zcol = "designation")
Code
# Flow/yield (and temperature) records at sub-daily resolution.
# Two sites outside this analysis are dropped up front.
dat_sub <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_FlowTempData_Raw.csv") %>%
  filter(!site_name %in% c("WoundedBuckCreek", "Brackett Creek"))

# Little-g (tributary) gages in the West Brook network.
little_g_sites <- c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", 
                    "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", 
                    "Sanderson Brook", "Avery Brook", "West Whately Brook")

dat_little <- dat_sub %>% 
  filter(site_name %in% little_g_sites) %>% 
  select(site_name, datetime, flow, area_sqmi)

Download 15-min NWIS data for big G (West Brook NWIS)

Code
# Download unit-value (15-min, service = "uv") records for the big-G gage
# (USGS 01171100, West Brook NWIS), localized to America/New_York.
wbnwis <- tibble(readNWISdata(sites = "01171100", service = "uv", startDate = "1980-01-01", endDate = Sys.Date(), tz = "America/New_York"))
# NOTE(review): columns selected by position — presumably 2:4 are site number,
# datetime, and discharge, but this is fragile if dataRetrieval ever changes
# its column order; confirm against names(wbnwis).
wbnwis2 <- wbnwis[,c(2,3,4)]
names(wbnwis2) <- c("station_no", "datetime", "flow") 
# Attach site metadata for West Brook NWIS (join keys inferred from shared
# column names) and keep only the columns used downstream.
dat_big <- wbnwis2 %>% 
  left_join(siteinfo %>% filter(site_name == "West Brook NWIS")) %>% 
  select(site_name, datetime, flow, area_sqmi)

Organize 15-min and 1-hour datasets, load daily data

Code
# 15-minute series: stack little-g and big-G records and fix the site
# plotting order. Units: cfs -> cms (x 0.02831683199881) and mi^2 -> km^2
# (x 2.58999); yield is mm of water per 15-min (900 s) interval.
site_levels <- c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", 
                 "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", 
                 "Sanderson Brook", "Avery Brook", "West Whately Brook", 
                 "West Brook NWIS")

dat_15min <- bind_rows(dat_little, dat_big) %>%
  mutate(site_name = factor(site_name, levels = site_levels)) %>%
  mutate(flow_cms = flow*0.02831683199881, area_sqkm = area_sqmi*2.58999) %>%
  mutate(yield = flow_cms * 900 * (1/(area_sqkm)) * (1/1000000) * 1000)


# Hourly series: floor timestamps to the hour and average the 15-min flows
# within each site-hour, then convert units. Yield uses 3600 s per interval.
dat_1hr <- bind_rows(dat_little, dat_big) %>%
  filter(!is.na(flow)) %>% 
  mutate(
    site_name = factor(site_name, levels = c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", "Sanderson Brook", "Avery Brook", "West Whately Brook", "West Brook NWIS")),
    datetime = floor_date(datetime, unit = "hour")
  ) %>%
  group_by(site_name, datetime) %>% 
  # unique() on area_sqmi doubles as a sanity check: it errors if a site
  # carries more than one drainage area value
  summarise(flow = mean(flow), area_sqmi = unique(area_sqmi)) %>%
  ungroup() %>%
  mutate(
    flow_cms = flow*0.02831683199881, 
    area_sqkm = area_sqmi*2.58999,
    yield = flow_cms * 3600 * (1/(area_sqkm)) * (1/1000000) * 1000
  )

# Daily series: read pre-computed daily/weekly data and restrict to the
# West Brook network, using the same site ordering as the sub-daily data.
wb_site_levels <- c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", 
                    "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", 
                    "Sanderson Brook", "Avery Brook", "West Whately Brook", 
                    "West Brook NWIS")

dat_1day <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_FlowTempData_DailyWeekly.csv") %>%
  filter(site_name %in% wb_site_levels) %>%
  mutate(site_name = factor(site_name, levels = wb_site_levels))

4.1.2 View 15-min data

Plot 15 min time series data

Code
# Wide table (one column per site) so dygraphs draws each site as a series.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE orders
# columns by factor level, matching spread()'s behavior so the color vector
# (9 Zissou colors for little g's + black for big G) still lines up.
dat_15min %>% 
  select(datetime, site_name, yield) %>% 
  pivot_wider(names_from = site_name, values_from = yield, names_sort = TRUE) %>% 
  dygraph() %>% 
  dyRangeSelector() %>% 
  dyAxis("y", label = "Yield (mm)") %>% 
  dyOptions(colors = c(hcl.colors(9, "Zissou 1"), "black")) %>% 
  dyHighlight() 

4.1.3 View 1-hour data

Plot 1-hour time series data

Code
# Wide table (one column per site) for dygraphs. pivot_wider() replaces the
# superseded spread(); names_sort = TRUE keeps factor-level column order so
# the site-to-color mapping is unchanged.
dat_1hr %>% 
  select(datetime, site_name, yield) %>% 
  pivot_wider(names_from = site_name, values_from = yield, names_sort = TRUE) %>% 
  dygraph() %>% 
  dyRangeSelector() %>% 
  dyAxis("y", label = "Yield (mm)") %>% 
  dyOptions(colors = c(hcl.colors(9, "Zissou 1"), "black")) %>% 
  dyHighlight() 

4.1.4 View 1-day data

Plot 1-day/daily time series data

Code
# Wide table (one column per site) for dygraphs, keyed on date rather than
# datetime. pivot_wider() replaces the superseded spread(); names_sort = TRUE
# keeps factor-level column order so the site-to-color mapping is unchanged.
dat_1day %>% 
  select(date, site_name, Yield_filled_mm) %>% 
  pivot_wider(names_from = site_name, values_from = Yield_filled_mm, names_sort = TRUE) %>% 
  dygraph() %>% 
  dyRangeSelector() %>% 
  dyAxis("y", label = "Yield (mm)") %>% 
  dyOptions(colors = c(hcl.colors(9, "Zissou 1"), "black")) %>% 
  dyHighlight() 

4.2 Event pairing

Conduct event pairing using hydroEvents package to understand lag time between peak flows at big and little g’s

4.2.1 Hourly data

4.2.1.1 Conduct event pairing

Code
# baseflow separation and event delineation parameters
alp <- 0.925    # Lyne-Hollick filter coefficient
numpass <- 9    # filter passes
thresh <- 0.5   # BFI threshold for event delineation

# little-g sites (big G, West Brook NWIS, is the reference series)
sites <- unique(dat_1hr$site_name)[1:9]

# preallocated list to store one tibble of matched peaks per site
outlist <- vector("list", length(sites))

for (j in seq_along(sites)) {
  # grab big and little g data and combine into a single tibble;
  # join explicitly on datetime (both inputs carry only datetime + yield)
  littleg <- dat_1hr %>% filter(site_name == sites[j])
  bigg <- dat_1hr %>% filter(site_name == "West Brook NWIS")
  mytib <- bigg %>% 
    select(datetime, yield) %>% 
    rename(yield_big = yield) %>% 
    left_join(littleg %>% select(datetime, yield) %>% rename(yield_little = yield), 
              by = "datetime")
  
  # baseflow separation: run the filter once per series and reuse both the
  # baseflow (bf) and baseflow index (bfi) components
  mytib_com <- mytib %>% filter(!is.na(yield_big), !is.na(yield_little))
  bfs_big <- baseflowB(mytib_com$yield_big, alpha = alp, passes = numpass)
  bfs_little <- baseflowB(mytib_com$yield_little, alpha = alp, passes = numpass)
  mytib_bf <- mytib_com %>% 
    mutate(bf_big = bfs_big$bf, 
           bfi_big = bfs_big$bfi,
           bf_little = bfs_little$bf, 
           bfi_little = bfs_little$bfi)
  
  # delineate events
  events_little <- eventBaseflow(mytib_bf$yield_little, BFI_Th = thresh, bfi = mytib_bf$bfi_little)
  events_big <- eventBaseflow(mytib_bf$yield_big, BFI_Th = thresh, bfi = mytib_bf$bfi_big)
  
  # event matching; keep only fully matched pairs
  mypairs <- pairEvents(events_little, events_big, lag = 5, type = 1)
  mypairs_com <- mypairs[complete.cases(mypairs),]
  
  # get matched event info: peak time and magnitude for each paired event
  npairs <- nrow(mypairs_com)
  matchpeaktib <- tibble(datetime_little = rep(NA, times = npairs), 
                         datetime_big = rep(NA, times = npairs),
                         yield_little = rep(NA, times = npairs), 
                         yield_big = rep(NA, times = npairs))
  # seq_len() (not 1:npairs) so a site with zero matched pairs skips cleanly
  for (i in seq_len(npairs)) {
    matchpeaktib$datetime_little[i] <- mytib_bf$datetime[events_little$which.max[events_little$srt == mypairs_com$srt[i]]]
    matchpeaktib$datetime_big[i] <- mytib_bf$datetime[events_big$which.max[events_big$srt == mypairs_com$matched.srt[i]]]
    matchpeaktib$yield_little[i] <- events_little$max[events_little$srt == mypairs_com$srt[i]]
    matchpeaktib$yield_big[i] <- events_big$max[events_big$srt == mypairs_com$matched.srt[i]]
  }
  # restore datetime class (assignment above stored raw numeric values) and
  # compute the big-minus-little peak lag in hours
  matchpeaktib <- matchpeaktib %>% mutate(datetime_little = as_datetime(datetime_little),
                                          datetime_big = as_datetime(datetime_big),
                                          timediff_hrs = as.numeric(difftime(datetime_big, datetime_little), units = "hours"),
                                          site_name = sites[j])
  
  # store output in list
  outlist[[j]] <- matchpeaktib
}
matchpeaktib <- bind_rows(outlist)
(matchpeaktib)
# A tibble: 515 × 6
   datetime_little     datetime_big        yield_little yield_big timediff_hrs
   <dttm>              <dttm>                     <dbl>     <dbl>        <dbl>
 1 2020-02-07 17:00:00 2020-02-07 23:00:00        0.150    0.155             6
 2 2020-02-21 10:00:00 2020-02-21 18:00:00        0.111    0.0893            8
 3 2020-02-27 09:00:00 2020-02-27 16:00:00        0.432    0.477             7
 4 2020-03-13 13:00:00 2020-03-13 19:00:00        0.190    0.180             6
 5 2020-03-19 10:00:00 2020-03-19 16:00:00        0.191    0.181             6
 6 2020-03-29 23:00:00 2020-03-30 05:00:00        0.327    0.332             6
 7 2020-04-09 17:00:00 2020-04-09 23:00:00        0.204    0.197             6
 8 2020-04-13 19:00:00 2020-04-14 02:00:00        0.419    0.506             7
 9 2020-04-30 15:00:00 2020-04-30 22:00:00        0.214    0.218             7
10 2020-05-01 19:00:00 2020-05-02 00:00:00        0.811    1.28              5
# ℹ 505 more rows
# ℹ 1 more variable: site_name <fct>

4.2.1.2 Plot output

Distribution of lags

Constrain lag times to realistic values (>=0 and <= 24) as event pairing is not perfect, and view histograms by site

Code
# Distribution of peak-flow lags (hours), one histogram panel per site
matchpeaktib %>% 
  filter(timediff_hrs >= 0, timediff_hrs <= 24) %>% 
  ggplot(aes(x = timediff_hrs)) + 
  geom_histogram() + 
  facet_wrap(~ site_name)

View distributions summarized as boxplots. Sites are ordered from closest to Big G (bottom) to furthest (top). Interestingly, there is not a strong pattern of longer lag times for further sites.

Code
# Lag distributions summarized as boxplots, flipped so sites read vertically
matchpeaktib %>% 
  filter(timediff_hrs >= 0, timediff_hrs <= 24) %>% 
  ggplot(aes(x = site_name, y = timediff_hrs)) + 
  geom_boxplot() + 
  coord_flip()

Lag by yield

Does lag time depend on the magnitude of yield at big G? Under high flows when water is moving more quickly, we might expect the lag to be shorter (negative relationship).

View global relationship

Code
# Lag vs. big-G event yield, all sites pooled (lags jittered for visibility)
matchpeaktib %>% 
  filter(timediff_hrs >= 0, timediff_hrs <= 24) %>% 
  ggplot(aes(x = yield_big, y = jitter(timediff_hrs))) + 
  geom_point() + 
  geom_smooth()

View by site

Code
# Lag vs. big-G event yield, faceted by site (lags jittered for visibility)
matchpeaktib %>% 
  filter(timediff_hrs >= 0, timediff_hrs <= 24) %>% 
  ggplot(aes(x = yield_big, y = jitter(timediff_hrs))) + 
  geom_point() + 
  geom_smooth() + 
  facet_wrap(~ site_name)

There appears to be some evidence for shorter lag times with increasing flow, but this relationship is only evident for very low flows. What is more apparent is the overall attenuation in variability in lag time as flows increase: at very low flows, lags are highly variable, but less variable (and intermediate in magnitude) under high flows.

Lag by time

Does lag time change over time? Perhaps lag time is seasonal and changes with antecedent conditions. Note that this is not the best way to get at the question of the importance of antecedent conditions.

View global relationship

Code
# Lag vs. day of year (little-g peak date), all sites pooled
matchpeaktib %>% 
  filter(timediff_hrs >= 0, timediff_hrs <= 24) %>% 
  mutate(doy = yday(datetime_little)) %>% 
  ggplot(aes(x = doy, y = jitter(timediff_hrs))) + 
  geom_point() + 
  geom_smooth()

View by site

Code
# Lag vs. day of year, faceted by site
matchpeaktib %>% 
  filter(timediff_hrs >= 0, timediff_hrs <= 24) %>% 
  mutate(doy = yday(datetime_little)) %>% 
  ggplot(aes(x = doy, y = jitter(timediff_hrs))) + 
  geom_point() + 
  geom_smooth() + 
  facet_wrap(~ site_name)

There does not appear to be any consistent pattern (within or among sites) in how lag times change with time of year

4.2.2 Daily data

4.2.2.1 Conduct event pairing

Code
# baseflow separation and event delineation parameters
alp <- 0.925    # Lyne-Hollick filter coefficient
numpass <- 3    # fewer passes than hourly, matching the coarser resolution
thresh <- 0.5   # BFI threshold for event delineation

# little-g sites (big G, West Brook NWIS, is the reference series)
sites <- unique(dat_1day$site_name)[1:9]

# preallocated list to store one tibble of matched peaks per site
outlist <- vector("list", length(sites))

for (j in seq_along(sites)) {
  # grab big and little g data and combine into a single tibble;
  # join explicitly on date (both inputs carry only date + yield)
  littleg <- dat_1day %>% filter(site_name == sites[j])
  bigg <- dat_1day %>% filter(site_name == "West Brook NWIS")
  mytib <- bigg %>% 
    select(date, Yield_filled_mm) %>% 
    rename(yield_big = Yield_filled_mm) %>% 
    left_join(littleg %>% select(date, Yield_filled_mm) %>% rename(yield_little = Yield_filled_mm), 
              by = "date")
  
  # baseflow separation: run the filter once per series and reuse both the
  # baseflow (bf) and baseflow index (bfi) components
  mytib_com <- mytib %>% filter(!is.na(yield_big), !is.na(yield_little))
  bfs_big <- baseflowB(mytib_com$yield_big, alpha = alp, passes = numpass)
  bfs_little <- baseflowB(mytib_com$yield_little, alpha = alp, passes = numpass)
  mytib_bf <- mytib_com %>% 
    mutate(bf_big = bfs_big$bf, 
           bfi_big = bfs_big$bfi,
           bf_little = bfs_little$bf, 
           bfi_little = bfs_little$bfi)
  
  # delineate events
  events_little <- eventBaseflow(mytib_bf$yield_little, BFI_Th = thresh, bfi = mytib_bf$bfi_little)
  events_big <- eventBaseflow(mytib_bf$yield_big, BFI_Th = thresh, bfi = mytib_bf$bfi_big)
  
  # event matching; keep only fully matched pairs
  mypairs <- pairEvents(events_little, events_big, lag = 5, type = 1)
  mypairs_com <- mypairs[complete.cases(mypairs),]
  
  # get matched event info: peak date and magnitude for each paired event
  npairs <- nrow(mypairs_com)
  matchpeaktib <- tibble(datetime_little = rep(NA, times = npairs), 
                         datetime_big = rep(NA, times = npairs),
                         yield_little = rep(NA, times = npairs), 
                         yield_big = rep(NA, times = npairs))
  # seq_len() (not 1:npairs) so a site with zero matched pairs skips cleanly
  for (i in seq_len(npairs)) {
    matchpeaktib$datetime_little[i] <- mytib_bf$date[events_little$which.max[events_little$srt == mypairs_com$srt[i]]]
    matchpeaktib$datetime_big[i] <- mytib_bf$date[events_big$which.max[events_big$srt == mypairs_com$matched.srt[i]]]
    matchpeaktib$yield_little[i] <- events_little$max[events_little$srt == mypairs_com$srt[i]]
    matchpeaktib$yield_big[i] <- events_big$max[events_big$srt == mypairs_com$matched.srt[i]]
  }
  # restore Date class (assignment above stored raw numeric values) and
  # compute the big-minus-little peak lag in days
  matchpeaktib <- matchpeaktib %>% mutate(datetime_little = as_date(datetime_little),
                                          datetime_big = as_date(datetime_big),
                                          timediff_dys = as.numeric(difftime(datetime_big, datetime_little), units = "days"),
                                          site_name = sites[j])
  
  # store output in list
  outlist[[j]] <- matchpeaktib
}
matchpeaktib <- bind_rows(outlist)
(matchpeaktib)
# A tibble: 337 × 6
   datetime_little datetime_big yield_little yield_big timediff_dys site_name  
   <date>          <date>              <dbl>     <dbl>        <dbl> <fct>      
 1 2020-02-27      2020-02-27         10.5       6.35             0 Avery Brook
 2 2020-03-29      2020-03-29          9.56      6.19             0 Avery Brook
 3 2020-04-13      2020-04-13          8.76      6.24             0 Avery Brook
 4 2020-05-01      2020-05-01         20.1      14.8              0 Avery Brook
 5 2020-05-16      2020-05-16          2.68      1.93             0 Avery Brook
 6 2020-07-03      2020-07-04          1.17      0.447            1 Avery Brook
 7 2020-07-09      2020-07-10          3.05      0.763            1 Avery Brook
 8 2020-07-17      2020-07-17          0.684     0.364            0 Avery Brook
 9 2020-07-23      2020-07-23          0.630     0.904            0 Avery Brook
10 2020-08-04      2020-08-05          0.615     0.229            1 Avery Brook
# ℹ 327 more rows

4.2.2.2 Plot output

Distribution of lags

Constrain lag times to realistic values (>= 0 and <= 1 day) as event pairing is not perfect, and view histograms by site

Code
# Distribution of peak-flow lags (days), one histogram panel per site
matchpeaktib %>% 
  filter(timediff_dys >= 0, timediff_dys <= 1) %>% 
  ggplot(aes(x = timediff_dys)) + 
  geom_histogram() + 
  facet_wrap(~ site_name)

View distributions summarized as means. Sites are ordered from closest to Big G (bottom) to furthest (top). There is a general pattern of longer lag times for further sites.

Code
# Mean daily lag per site (error bars computed but currently not drawn)
matchpeaktib %>% 
  filter(timediff_dys >= 0, timediff_dys <= 1) %>% 
  group_by(site_name) %>% 
  summarize(diffmean = mean(timediff_dys), diffsd = sd(timediff_dys)) %>% 
  ggplot() + 
  geom_point(aes(x = site_name, y = diffmean)) + 
  #geom_errorbar(aes(x = site_name, ymin = diffmean - diffsd, ymax = diffmean + diffsd)) + 
  coord_flip()

Lag by yield

Does lag time depend on the magnitude of yield at big G? Under high flows when water is moving more quickly, we might expect the lag to be shorter (negative relationship).

View global relationship

Code
# Lag vs. big-G event yield, all sites pooled (lags jittered for visibility)
matchpeaktib %>% 
  filter(timediff_dys >= 0, timediff_dys <= 1) %>% 
  ggplot(aes(x = yield_big, y = jitter(timediff_dys))) + 
  geom_point() + 
  geom_smooth()

View by site

Code
# Lag vs. big-G event yield, faceted by site (lags jittered for visibility)
matchpeaktib %>% 
  filter(timediff_dys >= 0, timediff_dys <= 1) %>% 
  ggplot(aes(x = yield_big, y = jitter(timediff_dys))) + 
  geom_point() + 
  geom_smooth() + 
  facet_wrap(~ site_name)

As with the hourly data, there appears to be some evidence for shorter lag times with increasing flow, but this relationship is only evident for very low flows. Note, however, that 1-day lags are very rare (16% of all observations).

Lag by time

Does lag time change over time? Perhaps lag time is seasonal and changes with antecedent conditions. Note that this is not the best way to get at the question of the importance of antecedent conditions.

View global relationship

Code
# Lag vs. day of year (little-g peak date), all sites pooled
matchpeaktib %>% 
  filter(timediff_dys >= 0, timediff_dys <= 1) %>% 
  mutate(doy = yday(datetime_little)) %>% 
  ggplot(aes(x = doy, y = jitter(timediff_dys))) + 
  geom_point() + 
  geom_smooth()

View by site

Code
# Lag vs. day of year, faceted by site
# (no jitter here, unlike the pooled plot above — lags are plotted raw)
matchpeaktib %>% 
  filter(timediff_dys >= 0, timediff_dys <= 1) %>% 
  mutate(doy = yday(datetime_little)) %>% 
  ggplot(aes(x = doy, y = (timediff_dys))) + 
  geom_point() + 
  geom_smooth() + 
  facet_wrap(~ site_name)

The prevalence of 1-day lags in peak flow generally peaks in mid summer (July) when flows are low

4.3 Dynamic time warping

Explore the use of dynamic time warping (Giorgino 2009) to align hourly time series data.

4.3.1 Select data

Trim to a restricted period because DTW cannot handle very large datasets

Code
# Restrict to a single little g (Avery Brook) and big G over Mar-Jun 2020,
# since DTW is quadratic in series length.
window_start <- as_datetime("2020-03-01 00:00:00")
window_end <- as_datetime("2020-06-01 00:00:00")
littleg <- dat_1hr %>% 
  filter(site_name == "Avery Brook", datetime >= window_start & datetime <= window_end)
bigg <- dat_1hr %>% 
  filter(site_name == "West Brook NWIS", datetime >= window_start & datetime <= window_end)

# Combine into one tibble (joined on datetime) and view both series
mytib <- bigg %>% 
  select(datetime, yield) %>% 
  rename(yield_big = yield) %>% 
  left_join(littleg %>% select(datetime, yield) %>% rename(yield_little = yield), 
            by = "datetime")
mytib %>% dygraph() %>% dyRangeSelector() %>% dyAxis("y", label = "Yield (mm)")

4.3.2 Align data

Code
# Align the little-g series (query, x) to the big-G series (reference, y)
# with dynamic time warping. The argument is spelled out as step.pattern:
# the original `step =` relied on R's partial argument matching, which is
# fragile and flagged by lintr. keep = TRUE retains the cost matrices so
# the alignment plots below can be drawn.
align1hr <- dtw(x = unlist(littleg %>% select(yield)), 
                y = unlist(bigg %>% select(yield)), 
                step.pattern = asymmetric, keep = TRUE)
str(align1hr)
List of 20
 $ costMatrix        : num [1:2209, 1:2209] 0.0076 0.0136 0.0195 0.0263 0.0328 ...
 $ directionMatrix   : int [1:2209, 1:2209] NA 1 1 1 1 1 1 1 1 1 ...
 $ stepPattern       : 'stepPattern' num [1:6, 1:4] 1 1 2 2 3 3 1 0 1 0 ...
  ..- attr(*, "npat")= num 3
  ..- attr(*, "norm")= chr "N"
 $ N                 : int 2209
 $ M                 : int 2209
 $ call              : language dtw(x = unlist(littleg %>% select(yield)), y = unlist(bigg %>% select(yield)),      step.pattern = asymmetric, ke| __truncated__
 $ openEnd           : logi FALSE
 $ openBegin         : logi FALSE
 $ windowFunction    :function (iw, jw, ...)  
 $ jmin              : int 2209
 $ distance          : num 35.6
 $ normalizedDistance: num 0.0161
 $ index1            : num [1:2209] 1 2 3 4 5 6 7 8 9 10 ...
 $ index2            : num [1:2209] 1 3 4 6 8 10 12 13 14 16 ...
 $ index1s           : num [1:2209] 1 2 3 4 5 6 7 8 9 10 ...
 $ index2s           : num [1:2209] 1 3 4 6 8 10 12 13 14 16 ...
 $ stepsTaken        : int [1:2208] 3 2 3 3 3 3 2 2 3 2 ...
 $ localCostMatrix   : 'crossdist' num [1:2209, 1:2209] 0.0076 0.006 0.00586 0.00683 0.00652 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  ..- attr(*, "method")= chr "Euclidean"
  ..- attr(*, "call")= language proxy::dist(x = x, y = y, method = dist.method)
 $ query             : num [1:2209, 1] 0.096 0.0976 0.0977 0.0967 0.097 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  .. ..$ : NULL
 $ reference         : num [1:2209, 1] 0.1036 0.1012 0.1005 0.0982 0.096 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  .. ..$ : NULL
 - attr(*, "class")= chr "dtw"

View index alignment

Code
# Three-way plot: query vs. reference index with the warping curve
plot(align1hr, type = "threeway")

Show aligned values

Code
# Two-way plot: both series with match lines; reference offset down by 1
plot(align1hr, type = "twoway", offset = - 1)

View aligned timeseries using dyGraphs. Clearly, this is not a great approach as it matches multiple query data points to the same reference index, i.e., the result is multiple little g flow readings at a single time point. As seen in the plots above, it also does not align the series correctly.

Code
# Re-assemble the warped series on the reference (big G) time axis: each
# reference timestamp carries the little-g value(s) DTW matched to it.
aligneddata <- tibble(datetime = bigg$datetime[align1hr$index2], 
                      query = littleg$yield[align1hr$index1], 
                      reference = bigg$yield[align1hr$index2])
aligneddata %>% 
  dygraph() %>% 
  dyRangeSelector() %>% 
  dyAxis("y", label = "Yield (mm)")